---

Data Analysis Assignment

Data Exploration and Visualization

Lorenzo Rodriguez

LBS MFA 2024

---

#loading data

rats <- vroom::vroom(here::here("ca09.mfa2024/data/Rat_Sightings_20231015.csv"), 
                     na = c("", "NA", "N/A")) %>% 
  janitor::clean_names() %>% 
  mutate(
    # convert the string to a date format
    created_date = mdy_hms(created_date),
    
    # just the date without hour info
    sighting_date = as.Date(created_date), 
    
    # get the year, month, etc
    sighting_year = year(created_date),
    sighting_month = month(created_date),
    sighting_month_name = month(created_date, label = TRUE, abbr = FALSE),
    sighting_day = day(created_date),
    sighting_weekday = wday(created_date, label = TRUE, abbr = FALSE)) %>%
  filter(borough != "Unspecified") %>%  
  mutate(borough = str_to_title(borough))

#loading the nyc_weather dataset

nyc_weather <- vroom::vroom(here::here("ca09.mfa2024/data/nyc_weather.csv"), 
                     na = c("", "NA", "N/A")) %>% 
  janitor::clean_names()%>%
  mutate(
    # convert the string to a date format
    date = dmy(datetime),
    
    # get the year, month, etc
    year = year(date),
    month = month(date),
    month_name = month(date, label = TRUE, abbr = FALSE),
    day = day(date),
    weekday = wday(date, label = TRUE, abbr = FALSE))
  1. Exploratory Data Analysis

#looking at the laoded data

glimpse(rats) 
## Rows: 230,028
## Columns: 44
## $ unique_key                     <dbl> 56242499, 56242808, 57383018, 56610428,…
## $ created_date                   <dttm> 2022-12-13 18:52:10, 2022-12-13 14:28:…
## $ closed_date                    <chr> "12/13/2022 06:52:10 PM", "12/13/2022 0…
## $ agency                         <chr> "DOHMH", "DOHMH", "DOHMH", "DOHMH", "DO…
## $ agency_name                    <chr> "Department of Health and Mental Hygien…
## $ complaint_type                 <chr> "Rodent", "Rodent", "Rodent", "Rodent",…
## $ descriptor                     <chr> "Rat Sighting", "Rat Sighting", "Rat Si…
## $ location_type                  <chr> "3+ Family Apt. Building", "Other (Expl…
## $ incident_zip                   <dbl> 10472, 10025, 10026, 11204, 10451, 1122…
## $ incident_address               <chr> "1259 BRONX RIVER AVENUE", "741 WEST EN…
## $ street_name                    <chr> "BRONX RIVER AVENUE", "WEST END AVENUE"…
## $ cross_street_1                 <chr> "COLGATE AVENUE", "WEST 96 STREET", "WE…
## $ cross_street_2                 <chr> "EAST 172 STREET", "WEST 97 STREET", "W…
## $ intersection_street_1          <chr> "COLGATE AVENUE", "WEST   96 STREET", "…
## $ intersection_street_2          <chr> "EAST  172 STREET", "WEST   97 STREET",…
## $ address_type                   <chr> "ADDRESS", "ADDRESS", "ADDRESS", "INTER…
## $ city                           <chr> "BRONX", "NEW YORK", "NEW YORK", "BROOK…
## $ landmark                       <chr> "BRONX RIVER AVENUE", "WEST END AVENUE"…
## $ facility_type                  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ status                         <chr> "Closed", "Closed", "Closed", "Closed",…
## $ due_date                       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ resolution_action_updated_date <chr> "12/13/2022 06:52:10 PM", "12/13/2022 0…
## $ community_board                <chr> "09 BRONX", "07 MANHATTAN", "10 MANHATT…
## $ borough                        <chr> "Bronx", "Manhattan", "Manhattan", "Bro…
## $ x_coordinate_state_plane       <dbl> 1016821, 991701, 997727, 989367, 100845…
## $ y_coordinate_state_plane       <dbl> 241766, 229075, 232511, 167979, 239668,…
## $ park_facility_name             <chr> "Unspecified", "Unspecified", "Unspecif…
## $ park_borough                   <chr> "BRONX", "MANHATTAN", "MANHATTAN", "BRO…
## $ vehicle_type                   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ taxi_company_borough           <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ taxi_pick_up_location          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ bridge_highway_name            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ bridge_highway_direction       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ road_ramp                      <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ bridge_highway_segment         <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ latitude                       <dbl> 40.8, 40.8, 40.8, 40.6, 40.8, 40.7, 40.…
## $ longitude                      <dbl> -73.9, -74.0, -74.0, -74.0, -73.9, -73.…
## $ location                       <chr> "(40.83020720581248, -73.88230434536744…
## $ sighting_date                  <date> 2022-12-13, 2022-12-13, 2023-04-21, 20…
## $ sighting_year                  <dbl> 2022, 2022, 2023, 2023, 2022, 2023, 202…
## $ sighting_month                 <dbl> 12, 12, 4, 1, 12, 6, 8, 2, 3, 3, 7, 7, …
## $ sighting_month_name            <ord> December, December, April, January, Dec…
## $ sighting_day                   <int> 13, 13, 21, 24, 13, 30, 14, 24, 22, 22,…
## $ sighting_weekday               <ord> Tuesday, Tuesday, Friday, Tuesday, Tues…
skim(rats)
Data summary
Name rats
Number of rows 230028
Number of columns 44
_______________________
Column type frequency:
character 23
Date 1
factor 2
logical 8
numeric 9
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
closed_date 17910 0.92 22 22 0 102336 0
agency 0 1.00 5 5 0 1 0
agency_name 0 1.00 39 39 0 1 0
complaint_type 0 1.00 6 6 0 1 0
descriptor 0 1.00 12 12 0 1 0
location_type 12 1.00 5 29 0 49 0
incident_address 10926 0.95 2 52 0 104381 0
street_name 10927 0.95 2 49 0 8336 0
cross_street_1 26355 0.89 2 35 0 6756 0
cross_street_2 26348 0.89 2 35 0 6847 0
intersection_street_1 118251 0.49 2 32 0 5782 0
intersection_street_2 118215 0.49 3 32 0 5868 0
address_type 4752 0.98 7 12 0 6 0
city 3647 0.98 5 19 0 102 0
landmark 138598 0.40 3 32 0 4074 0
status 0 1.00 4 11 0 7 0
due_date 97803 0.57 22 22 0 132168 0
resolution_action_updated_date 9533 0.96 22 22 0 140301 0
community_board 0 1.00 8 25 0 73 0
borough 0 1.00 5 13 0 5 0
park_facility_name 0 1.00 11 11 0 1 0
park_borough 0 1.00 5 13 0 5 0
location 2245 0.99 26 40 0 114670 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
sighting_date 0 1 2010-01-01 2023-10-14 2018-07-03 5034

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
sighting_month_name 0 1 TRUE 12 Jul: 25811, Aug: 25551, Jun: 24694, May: 23633
sighting_weekday 0 1 TRUE 7 Mon: 39123, Tue: 38783, Wed: 38196, Thu: 36679

Variable type: logical

skim_variable n_missing complete_rate mean count
facility_type 230028 0 NaN :
vehicle_type 230028 0 NaN :
taxi_company_borough 230028 0 NaN :
taxi_pick_up_location 230028 0 NaN :
bridge_highway_name 230028 0 NaN :
bridge_highway_direction 230028 0 NaN :
road_ramp 230028 0 NaN :
bridge_highway_segment 230028 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
unique_key 0 1.00 4.00e+07 1.23e+07 1.15e+07 3.03e+07 3.96e+07 5.16e+07 5.91e+07 ▂▅▆▅▇
incident_zip 419 1.00 1.07e+04 5.89e+02 8.30e+01 1.01e+04 1.11e+04 1.12e+04 1.00e+05 ▇▁▁▁▁
x_coordinate_state_plane 2245 0.99 1.00e+06 1.84e+04 9.13e+05 9.93e+05 1.00e+06 1.01e+06 1.07e+06 ▁▁▇▅▁
y_coordinate_state_plane 2243 0.99 2.08e+05 2.92e+04 1.21e+05 1.87e+05 2.03e+05 2.33e+05 2.72e+05 ▁▃▇▅▃
latitude 2245 0.99 4.07e+01 8.00e-02 4.05e+01 4.07e+01 4.07e+01 4.08e+01 4.09e+01 ▁▃▇▅▃
longitude 2245 0.99 -7.39e+01 7.00e-02 -7.42e+01 -7.40e+01 -7.39e+01 -7.39e+01 -7.37e+01 ▁▁▇▅▁
sighting_year 0 1.00 2.02e+03 3.90e+00 2.01e+03 2.02e+03 2.02e+03 2.02e+03 2.02e+03 ▃▅▃▆▇
sighting_month 0 1.00 6.56e+00 3.04e+00 1.00e+00 4.00e+00 7.00e+00 9.00e+00 1.20e+01 ▇▇▇▇▇
sighting_day 0 1.00 1.57e+01 8.73e+00 1.00e+00 8.00e+00 1.60e+01 2.30e+01 3.10e+01 ▇▇▇▇▆

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
created_date 0 1 2010-01-01 08:29:58 2023-10-14 01:55:50 2018-07-03 113481

#Plotting the data

# See the count of rat sightings by weekday
rats %>%
  count(sighting_weekday) %>% 
  kable()
sighting_weekday n
Sunday 22529
Monday 39123
Tuesday 38783
Wednesday 38196
Thursday 36679
Friday 32558
Saturday 22160
# Assign a summarixed data frame to an object to use it in a plot
rats_by_weekday <- rats %>%
  count(sighting_weekday, sighting_year) 


ggplot(rats_by_weekday, aes(x = fct_rev(sighting_weekday), y = n)) + 
  geom_col() +
  coord_flip() +
  facet_wrap(~ sighting_year)

# See the count of rat sightings by weekday and borough
rats %>%
  count(sighting_weekday, borough, sighting_year) %>% 
  head(10) %>% 
  kable()
sighting_weekday borough sighting_year n
Sunday Bronx 2010 159
Sunday Bronx 2011 145
Sunday Bronx 2012 181
Sunday Bronx 2013 166
Sunday Bronx 2014 224
Sunday Bronx 2015 270
Sunday Bronx 2016 314
Sunday Bronx 2017 357
Sunday Bronx 2018 274
Sunday Bronx 2019 242
# An alternative to count() is to specify the groups with group_by() and then
# be explicit about how you're summarising the groups, such as calculating the
# mean, standard deviation, or number of observations (we do that here with
# `n()`).
# rats %>%
#   group_by(sighting_weekday, borough) %>%
#   summarise(n = n())

The above graph showcases the rat sightings based on the day during the years 2015-2023. On closer inspection we can see that while the sightings have increased over the years, it peaks at the beginning of the week, that is there are more sightings on mondays, tuesdays and wednesdays, and this pattern continues over the years.

#Scatter Plot for sightings per day

rats %>%
  group_by(sighting_date)%>%
  summarise(sighting_per_day = n())%>%
  ggplot(aes(x = sighting_date, y = sighting_per_day ))+
  geom_point(alpha = 0.5)+
  labs(title = "Number of Rat sightings in NYC per Day",
       x = "Days",
       y = "Number of Rat Sightings")

The graph presented above illustrates the daily variations in rat sightings. It provides a valuable insight into a recurring pattern in rat sightings, wherein the numbers tend to rise up to a certain date and then decline again, only to repeat this pattern annually.

Upon closer examination, it appears that rat sightings exhibit a seasonal trend. Specifically, we observe an increase in sightings during the warmer months, typically associated with summer, while during the colder, winter months, when temperatures drop, there is a noticeable decrease. This aligns with our initial hypothesis that rat sightings tend to diminish during winter, as the cold weather often prompts rodents to enter a hibernation-like state, reducing their visibility, and then resurge during the warmer months.

#Bar chart showing Number of Rat sightings in NYC by Month 

rats %>%
    group_by(sighting_month_name) %>%
    summarise(monthly_count = n()) %>%
    ggplot(aes(x= sighting_month_name, y = monthly_count)) +
    geom_col()+
    labs(title = paste("Number of Rat sightings in NYC by Month"),
       x = "Month",
       y = "Number of Rat Sightings")

The above graph shows us the sightings per month, and as visible from above we understand that there are more sightings over the summer, that is may to august as opposed to the winter months.

#Bar chart showing Number of Rat sightings in NYC by Year

rats %>%
  filter(sighting_year < 2023) %>% #filterng 2023 because of incomplete data points
  group_by(sighting_year) %>%
  summarise(yearly_count = n()) %>%
  ggplot(aes(x= sighting_year, y = yearly_count))+
  geom_col()+
  labs(title = "Number of Rat sightings in NYC by Year",
       x = "Year",
       y = "Number of Rat Sightings")

The above graph shows the rat sightings per year, and as already viewed previously, we see a steady increase in the sightings. Between years 2017-2020, however, the trend has reversed with the sightings decreasing, but 2021 and 2022 have seen a large increase post 2020.

We hypothesise that these sightings can also be linked to the number of people in the area, for example 2020 was the year of COVID-19 and with lockdowns in place, the fall in sightings could be because of the number of people outside.

#Function for line chart showing Number of Rat sightings in NYC by chosen Year

sighting_by_month_in_year <- function(sighting_year) {
  rats %>%
    filter(sighting_year == {{sighting_year}}) %>%
    group_by(sighting_month_name) %>%
    summarise(monthly_count = n()) %>%
    ggplot(aes(x= sighting_month_name, y = monthly_count, group = 1)) +
    geom_col()+
    labs(title = paste("Number of Rat sightings in NYC by Month in Year", sighting_year),
       x = "Month",
       y = "Number of Rat Sightings")
}

sighting_by_month_in_year(2022)

The above graphs show that the most sightings have taken place in Brooklyn and Manhattan, while the Staten Island has seen the least number of rats.

#mapping the data

# let's get the top 7 location types, which account for > 90% of all cases
# this code generates a vector with the top 7 location types
top_location_types <- rats %>%
  count(location_type, sort=TRUE) %>%
  mutate(perc = 100*n/sum(n)) %>%
  slice(1:7) %>%
  select(location_type) %>%
  pull()

# lets us choose how to colour each point. What palette and colours to use? 
# A great site to get the relevant color hex codes for maps is 
# https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=7
my_colours <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628')

# create a function point_fill, that assigns `my_colours` to different location types
# you can read more here https://rstudio.github.io/leaflet/colors.html
point_fill <- colorFactor(palette = my_colours,  
                          rats$location_type)



rats %>%
  filter(sighting_year == 2022) %>% # just show 2022 data
  filter(location_type %in% top_location_types) %>%
  leaflet() %>% 
  addProviderTiles("OpenStreetMap.Mapnik") %>% 
  addCircleMarkers(lng = ~longitude, 
                   lat = ~latitude, 
                   radius = 1, 
                   color = ~point_fill(location_type), 
                   fillOpacity = 0.6, 
                   popup = ~created_date,
                   label = ~location_type) %>%
  
  addLegend("bottomright", pal = point_fill, 
            values = ~location_type,
            title = "2022 Location Type",
            opacity = 0.5)

#rat sightings vs a borough’s human population

# https://en.wikipedia.org/wiki/Boroughs_of_New_York_City
# NYC Boroughs, 2020 census data
# Bronx  1,472,654  
# Brooklyn 2,736,074    
# Manhattan 1,694,263   
# Queens 2,405,464  
# Staten Island  495,747

nyc_population <- tibble(
  borough = c("Bronx", "Brooklyn", "Manhattan", "Queens", "Staten Island"),
  population = c(1472654,2736074,1694263,2405464,495747)) 
library(ggtext)

# your ggplot code +

  labs(
    title = "Mouse sightings don't always track a <span style='color:#FFA500'><b>borough's population</span></b>"
  ) +

  theme(
    plot.title.position = "plot",
    plot.title = element_textbox_simple(size=16)) +
  NULL
## integer(0)
# Calculate population proportion for each borough
nyc_population <- nyc_population %>% 
  mutate(population_proportion = population / sum(population))

# Subset the data for the years 2014-2022
filtered_rats <- rats %>%
  filter(sighting_year >= 2014 & sighting_year <= 2022)
# Calculate total sightings each year
total_sightings_by_year <- filtered_rats %>%
  group_by(sighting_year)%>%
  summarise(total_sightings_by_year = n())
# Combine three datasets and calculate proportions and CI for each borough and year
rat_sightings_by_borough_by_year <- filtered_rats %>%
  group_by(borough, sighting_year) %>%
  summarise(total_sightings_by_borough_by_year = n()) %>%
  left_join(nyc_population, by = "borough") %>%
  left_join(total_sightings_by_year, by = "sighting_year") %>%
  mutate(
    proportion = total_sightings_by_borough_by_year / total_sightings_by_year,
    # Calculate the Margin of Error
    margin_of_error = 1.96 * sqrt(proportion * (1 - proportion) / total_sightings_by_borough_by_year),
    lower_CI = proportion - margin_of_error,
    upper_CI = proportion + margin_of_error,
  )%>%
  group_by(sighting_year) %>%
  arrange(population_proportion)
# Plot the graph
ggplot(rat_sightings_by_borough_by_year, aes(y = reorder(borough, -desc(population_proportion)), x = proportion, xmin = lower_CI, xmax = upper_CI)) +
  geom_errorbar(width = 0.2) +
  geom_point(aes(x = population_proportion), size = 5, color = "#FFA500") +
  facet_wrap(~sighting_year, scales = "free_y")+
  labs(
    title = "Mouse sightings don't always track a <span style='color:#FFA500'><b>borough's population</span></b>",
    x = NULL,
    y = NULL
  ) +
  scale_x_continuous(labels = scales::percent)+
  theme(panel.grid.major = element_line(colour = "grey"),
    panel.grid.minor = element_line(colour = "grey"),
    panel.background = element_rect(fill = "white"),
    plot.background = element_rect(fill = "white"))+
  theme(
    plot.title.position = "plot",
    plot.title = element_textbox_simple(size=16)) +
  NULL

library(tidyquant)

sp500 <- "SPY" %>% 
  tq_get(get  = "stock.prices",
         from = "2010-01-01",
         to = Sys.Date())  # today's date

sp500_returns_daily <- sp500 %>%
  tq_transmute(select     = adjusted, 
               mutate_fun = periodReturn, 
               period     = "daily", 
               type       = "log",
               col_rename = "daily_return")  
rats %>%
  group_by(borough, sighting_date) %>%
  summarise(count = n()) %>%
  left_join(sp500_returns_daily, 
          by = c("sighting_date" = "date")) %>%
  mutate(sp500_up_down = ifelse(daily_return > 0, "up", "down")) %>%
  group_by(borough, sp500_up_down) %>%
  summarise(
    mean_n = mean(count),
    sd_n = sd(count),
    count = n(),
    se_n = sd_n / sqrt(count),
    lower_95 = mean_n - (1.96 * se_n),
    upper_95 = mean_n + (1.96 * se_n))
## # A tibble: 15 × 8
## # Groups:   borough [5]
##    borough       sp500_up_down mean_n  sd_n count   se_n lower_95 upper_95
##    <chr>         <chr>          <dbl> <dbl> <int>  <dbl>    <dbl>    <dbl>
##  1 Bronx         down            9.83  4.83  1555 0.123      9.59    10.1 
##  2 Bronx         up              9.53  4.86  1902 0.111      9.31     9.75
##  3 Bronx         <NA>            5.59  3.43  1528 0.0878     5.42     5.77
##  4 Brooklyn      down           19.3  10.8   1565 0.274     18.8     19.9 
##  5 Brooklyn      up             18.9  10.9   1901 0.251     18.4     19.4 
##  6 Brooklyn      <NA>           11.2   7.17  1554 0.182     10.8     11.5 
##  7 Manhattan     down           14.0   7.41  1564 0.187     13.6     14.3 
##  8 Manhattan     up             13.3   7.05  1897 0.162     13.0     13.7 
##  9 Manhattan     <NA>            8.80  5.54  1545 0.141      8.53     9.08
## 10 Queens        down            8.11  4.93  1540 0.126      7.86     8.36
## 11 Queens        up              7.95  4.99  1880 0.115      7.73     8.18
## 12 Queens        <NA>            4.86  3.38  1463 0.0883     4.69     5.03
## 13 Staten Island down            2.69  1.86  1242 0.0529     2.59     2.79
## 14 Staten Island up              2.69  1.83  1490 0.0474     2.60     2.78
## 15 Staten Island <NA>            2.00  1.43   935 0.0469     1.90     2.09

#4. Regression

#Investigate distribution of rat sightings, independent of borough
rats_by_day_noboroughs <- rats %>% 
  group_by(sighting_date) %>% 
  summarise(nr_of_sightings = n())
#Distribution of sightings
ggplot(rats_by_day_noboroughs, aes(x=nr_of_sightings)) + 
  geom_histogram(binwidth = 10)

#Distribution of log of rat sightings
ggplot(rats_by_day_noboroughs, aes(x=log(nr_of_sightings))) + 
  geom_histogram(binwidth = 0.1)

Log helps normalise the data and takes into account non-linear data. Therefore, we’ll use this data going forward.

#Summarise the rat sightings by day
rats_by_day <- rats %>% 
  group_by(borough, sighting_date) %>% 
  summarise(nr_of_sightings = n())

#Joining the rat sightings data with the weather data
rats_weather <- left_join(rats_by_day, nyc_weather, by = c("sighting_date" = "date"))
# import weather dataset
weather <- read_csv(here::here("ca09.mfa2024/data", "nyc_weather.csv"), na = "NA", show_col_types = FALSE)

# modify date variable using lubridate
weather$datetime <- lubridate::dmy(weather$datetime)
# creating the dataset for the single-variable regression

# join rats table with weather data
nyc <-
  left_join(rats, weather, c("sighting_date"="datetime"))

# count sightings per day
nyc_sighting <- nyc %>%
  group_by(sighting_date) %>%
  summarise(sighting_count = n())

# select only unique values in the weather dataset to remove 
nyc_temp <- nyc %>%
  select(sighting_date, temp) %>%
  distinct()

# join temprature and sighting per day
nyc_simple_reg <-
  left_join(nyc_sighting, nyc_temp, by="sighting_date")
  #group_by(sighting_date)

glimpse(nyc_simple_reg)
## Rows: 5,034
## Columns: 3
## $ sighting_date  <date> 2010-01-01, 2010-01-02, 2010-01-03, 2010-01-04, 2010-0…
## $ sighting_count <int> 9, 12, 3, 24, 14, 21, 15, 13, 10, 7, 25, 16, 24, 15, 19…
## $ temp           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
nyc_w = nyc_weather %>% 
  mutate(sighting_date = datetime)
rats_sighted = rats %>% 
  group_by(sighting_date, borough, sighting_weekday, sighting_month_name, 
           sighting_day,
           sighting_year) %>% 
  summarize(mouse_sighted = n(),)
## `summarise()` has grouped output by 'sighting_date', 'borough',
## 'sighting_weekday', 'sighting_month_name', 'sighting_day'. You can override
## using the `.groups` argument.
lm_data = left_join(rats_sighted, nyc_w, by="sighting_date")
lm_data_na_rm = na.omit(data.frame(lm_data) %>% 
  dplyr::select(sighting_date, mouse_sighted, borough, sighting_year, sighting_day,
         sighting_weekday, sighting_month_name, temp, humidity, 
         visibility, precip))
lm_data_na_rm_log = lm_data_na_rm %>% 
  mutate (log_times = log(mouse_sighted)) %>% 
  mutate (sighting_weekday = as.character(sighting_weekday)) %>% 
  mutate (sighting_month_name = as.character(sighting_month_name))

head(lm_data_na_rm_log)
##   sighting_date mouse_sighted   borough sighting_year sighting_day
## 1    2010-01-01             2     Bronx          2010            1
## 2    2010-01-01             4  Brooklyn          2010            1
## 3    2010-01-01             1 Manhattan          2010            1
## 4    2010-01-01             2    Queens          2010            1
## 5    2010-01-02             3     Bronx          2010            2
## 6    2010-01-02             4  Brooklyn          2010            2
##   sighting_weekday sighting_month_name temp humidity visibility precip
## 1           Friday             January  2.4     82.5       10.6   2.40
## 2           Friday             January  2.4     82.5       10.6   2.40
## 3           Friday             January  2.4     82.5       10.6   2.40
## 4           Friday             January  2.4     82.5       10.6   2.40
## 5         Saturday             January -3.5     60.2       14.3   0.18
## 6         Saturday             January -3.5     60.2       14.3   0.18
##   log_times
## 1     0.693
## 2     1.386
## 3     0.000
## 4     0.693
## 5     1.099
## 6     1.386
lm_data_na_rm_log %>% 
  ggplot(aes(x = mouse_sighted, fill = "red", col = "blue"))+
  geom_histogram(binwidth = 1)+
  theme_bw()+
  labs (x = "Number of Mouse Sighted",
        y = "Count",
        title = "Disrtibution of Mouse Sighted (in each location on each day)",
        subtitle = paste("Skewness = ",
                         round(skewness(lm_data_na_rm_log$mouse_sighted),4),
                         sep =""))+
  theme(legend.position = "none")

lm_data_na_rm_log %>% 
  ggplot(aes(x = log_times, fill="red"))+
  geom_histogram(binwidth=0.1)+
  theme_bw()+
  labs (x = "Number of Logged Mouse Sighted",
        y = "Count",
        title = "Disrtibution of Logged Mouse Sighted (in each location on each day)",
        subtitle = paste("Skewness = ",
                         round(skewness(lm_data_na_rm_log$log_times),4),
                         sep =""))+
  theme(legend.position = "none")

model1 = lm(data = lm_data_na_rm_log, log_times~temp)
msummary(model1)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.5197     0.0117   129.4   <2e-16 ***
## temp          0.0246     0.0007    35.2   <2e-16 ***
## 
## Residual standard error: 0.876 on 17376 degrees of freedom
## Multiple R-squared:  0.0664, Adjusted R-squared:  0.0664 
## F-statistic: 1.24e+03 on 1 and 17376 DF,  p-value: <2e-16
model2 = lm(data = lm_data_na_rm_log, log_times ~ temp + borough)
msummary(model2)
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.546810   0.012695   121.8   <2e-16 ***
## temp                  0.028700   0.000516    55.6   <2e-16 ***
## boroughBrooklyn       0.547974   0.014970    36.6   <2e-16 ***
## boroughManhattan      0.302432   0.014982    20.2   <2e-16 ***
## boroughQueens        -0.325024   0.015103   -21.5   <2e-16 ***
## boroughStaten Island -1.287073   0.016384   -78.5   <2e-16 ***
## 
## Residual standard error: 0.644 on 17372 degrees of freedom
## Multiple R-squared:  0.495,  Adjusted R-squared:  0.495 
## F-statistic: 3.41e+03 on 5 and 17372 DF,  p-value: <2e-16
model_n = lm(data = lm_data_na_rm_log, mouse_sighted~temp + borough)

msummary(model_n)
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.13049    0.12287    41.8   <2e-16 ***
## temp                  0.24074    0.00499    48.2   <2e-16 ***
## boroughBrooklyn       6.84463    0.14489    47.2   <2e-16 ***
## boroughManhattan      3.15229    0.14500    21.7   <2e-16 ***
## boroughQueens        -1.98048    0.14617   -13.6   <2e-16 ***
## boroughStaten Island -6.36670    0.15857   -40.1   <2e-16 ***
## 
## Residual standard error: 6.23 on 17372 degrees of freedom
## Multiple R-squared:  0.37,   Adjusted R-squared:  0.369 
## F-statistic: 2.04e+03 on 5 and 17372 DF,  p-value: <2e-16
performance::check_model(model1)

performance::check_model(model2)

car::vif(model2)
##         GVIF Df GVIF^(1/(2*Df))
## temp       1  1               1
## borough    1  4               1
library(huxtable)
summary_table = huxtable::huxreg(model1, model2,
                 statistics = c('No. observations' = 'nobs',
                                'R^2' =  'r.squared',
                                "Adj. R^2" = "adj.r.squared"),
                 bold_signif = 0.05) %>% 
  set_caption("Two-Model Result")
summary_table
Two-Model Result
(1)(2)
(Intercept)1.520 ***1.547 ***
(0.012)   (0.013)   
temp0.025 ***0.029 ***
(0.001)   (0.001)   
boroughBrooklyn        0.548 ***
        (0.015)   
boroughManhattan        0.302 ***
        (0.015)   
boroughQueens        -0.325 ***
        (0.015)   
boroughStaten Island        -1.287 ***
        (0.016)   
No. observations17378        17378        
R^20.066    0.495    
Adj. R^20.066    0.495    
*** p < 0.001; ** p < 0.01; * p < 0.05.